# Linear Models with Outliers: Choosing between Conditional-Mean and Conditional-Median Methods
# State Politics & Policy Quarterly 11(4).
# Jeffrey J. Harden (jjharden@unc.edu) and Bruce A. Desmarais (desmarais@polsci.umass.edu)
#
# R Functions to perform the CVDM test
# See the Web Appendix for an example of how to use the CVDM() function
# Note: This version also performs the test for Robust Regression (RR), which is regression with a t(4) error term
###############################################################################

### Functions for the Computation of Vuong and the Cross-Validated Johnson's t-test with OLS, MR, and RR ###

## Compute Skewness ##

mu3hat <- function(x){
	n <- length(x)
	ns <- n*1/(n-1)*1/(n-2)
	ns*sum((x-mean(x))^3)
}

## Johnson's t ##

johnsons.t <- function(x){
	m3 <- mu3hat(x)
	s <- sd(x)
	n <- length(x)
	(mean(x)+m3/(6*s^2*n)+m3/(3*s^4)*mean(x)^2)*sqrt(n)/s
}

## Centered Laplace Density ##

dlapl <- function(x,b){
	return(1/(2*b)*exp(-abs(x/b)))
}

# RR Psi function
psi.t <- function(u,deriv=0){
	v <- 4
	if(deriv==0){
		return((v+1)/(v+u^2))
	}
	if(deriv==1){
		return(1/(v+u^2)-(v+1)/(v+u^2)^2)
	}
}

# t density function
dst <- function(x,sig,df){
	f <- gamma((df+1)/2)/(sig*sqrt(df*pi)*gamma(df/2))*((df+x^2/sig^2)/df)^(-(df+1)/2)
	return(f)
}

desingular <- function(x,y){
	inv <- try(solve(t(x)%*%x))
	if(!is.matrix(inv)){
		sdy <- sd(y)
		for(i in 1:ncol(x)){
			sdx <- sd(x[,i])
			if(sdx > 0) x[,i] <- x[,i]+rnorm(length(x[,i]),sd=sdx/10000)
			if(sdx == 0) x[,i] <- x[,i]+rnorm(length(x[,i]),sd=sdy/10000)
		}
	}
	return(x)
}

## lls for MR and OLS ##

ll2 <- function(formula,data){
	ls <- lm(formula,data=data)
	mr <- rq(formula,data=data)
	sig <- summary(ls)$sigma
	b <- mean(abs(residuals(mr)))
	ll_ls <- dnorm(residuals(ls),sd=sig,log=T)
	ll_mr <- log(dlapl(residuals(mr),b=b))
	return(list(LS=ll_ls,MR=ll_mr))
}

## cvlls for MR and OLS ##

cvll2 <- function(formula,data){
	require(MASS)
	require(quantreg)
	est <- lm(formula,data=data,x=T,y=T)
	x <- est$x
	y <- est$y
	cvll.ls <- numeric(length(y))
	cvll.mr <- numeric(length(y))
	for(i in 1:length(y)){
		yt <- y[-i]
		xt <- desingular(x[-i,],yt)
		yv <- y[i]
		xv <- x[i,]
		ls <- lm(yt~-1+xt)
		mr <- rq(yt~-1+xt)
		sig <- summary(ls)$sigma
		b <- mean(abs(residuals(mr)))
		cvll.ls[i] <- dnorm(yv-rbind(xv)%*%coef(ls),sd=sig,log=T)
		cvll.mr[i] <- log(dlapl(yv-rbind(xv)%*%coef(mr),b=b))
	}
	return(list(LS=cvll.ls,MR=cvll.mr))
}

# Individual LL function
ll3 <- function(formula,data,method){
    ls <- lm(formula,data=data)
    mr <- rq(formula,data=data)
    rr <- rlm(formula,data=data,psi=psi.t,method=method,maxit=100)
    sig <- summary(ls)$sigma
    b <- mean(abs(residuals(mr)))
    scale <- rr$s
    ll_ls <- dnorm(residuals(ls),sd=sig,log=T)
    ll_mr <- log(dlapl(residuals(mr),b=b))
    ll_rr <- log(dst(residuals(rr),sig=scale,df=4))
    return(list(LS=ll_ls,MR=ll_mr,RR=ll_rr,ls.est=ls,mr.est=mr,rr.est=rr))
}


# Individual CVLL function
cvll3 <- function(formula,data,method){
    est <- lm(formula,data=data,x=T,y=T)
    x <- est$x
    y <- est$y
    cvll.ls <- numeric(length(y))
    cvll.mr <- numeric(length(y))
    cvll.rr <- numeric(length(y))
    for(i in 1:length(y)){
        yt <- y[-i]
        xt <- desingular(x[-i,],yt)
        yv <- y[i]
        xv <- x[i,]
        ls <- lm(yt~-1+xt)
        mr <- rq(yt~-1+xt)
        rr <- rlm(yt~-1+xt,psi=psi.t,method=method,maxit=100)
        sig <- summary(ls)$sigma
        b <- mean(abs(residuals(mr)))
        scale <- rr$s
        cvll.ls[i] <- dnorm(yv-rbind(xv)%*%coef(ls),sd=sig,log=T)
        cvll.mr[i] <- log(dlapl(yv-rbind(xv)%*%coef(mr),b=b))
        cvll.rr[i] <- log(dst(yv-rbind(xv)%*%coef(rr),sig=scale,df=4))
     		cat("Completed observation", i, "of", length(y), "\n")
    }
    return(list(LS=cvll.ls,MR=cvll.mr,RR=cvll.rr,p=length(coef(est)),n=length(est$y)))
} 

## The function for computing the CVJT ##
# Takes formula and data frame arguments
# Returns cvjt and vuong -- respective t-statistics, can be used as t or z stats.
# both tests are fit(OLS)-fit(MR), such that negative values suport MR

CVDM <- function(formula,data,method = method){
	require(quantreg)
	require(MASS)
	lls <- ll3(formula,data,method)
	cvlls <- cvll3(formula,data,method)
	lld.LSvMR <- lls$LS - lls$MR
	cvlld.LSvMR <- cvlls$LS-cvlls$MR
	lld.LSvRR <- lls$LS - lls$RR
	cvlld.LSvRR <- cvlls$LS-cvlls$RR
	jt1 <- johnsons.t(cvlld.LSvMR)
	jt2 <- johnsons.t(cvlld.LSvRR)
	vuong1 <- t.test(lld.LSvMR)
	vuong2 <- t.test(lld.LSvRR)
	return(list(cvdm.LSvMR = jt1, cvdm.LSvMR.p = pnorm(abs(jt1), lower.tail = FALSE), cvdm.LSvRR = jt2, cvdm.LSvRR.p = pnorm(abs(jt2), lower.tail = FALSE), vuong.LSvMR=vuong1$statistic, vuong.LSvMR.p = vuong1$p.value, vuong.LSvRR=vuong2$statistic, vuong.LSvRR.p = vuong2$p.value))
}

# For robust SEs
cluster.se <- function(model, cluster){ # This one displays the SEs
         M <- length(unique(cluster))
         N <- length(cluster)           
         K <- model$rank
         dfc <- (M/(M - 1)) * ((N - 1)/(N - K))
         uj  <- apply(estfun(model), 2, function(x) tapply(x, cluster, sum));
         rcse.cov <- dfc * sandwich(model, meat = crossprod(uj)/N)
         rcse.se <- coeftest(model, rcse.cov)
         return(list(rcse.cov = rcse.cov, rcse.se = rcse.se))
        }


